#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include "Encoding.h"
#include "Tools.h"
#include "Bits.h"

/* Chunk-count limit; not referenced by the code visible in this part of the file. */
#define MAX_NUMBER_OF_CHUNKS 100

/* Tail overlap constant (units are not stated here); not referenced by the code visible in this part of the file. */
#define TAIL_OVERLAP 1000

/*
 * Converts a DNA character sequence into small integer codes.
 *
 * buffer              - receives one int per input character; must have room
 *                       for at least `length` entries.
 * dna                 - input characters; only the first `length` are read.
 * length              - number of characters to convert.
 * withTerminationChar - when non-zero, the last converted entry is replaced
 *                       by 0 (it overwrites the code of the final character,
 *                       it is not appended after it).
 *
 * Mapping (case-insensitive): A->1, C->2, G->3, T->4. Any other character is
 * encoded as 1, i.e. exactly like 'A'.
 *
 * Always returns 0.
 */
int get1234Encoding(int *buffer,char *dna,int length,int withTerminationChar)
{
	int i;

	for(i=0;i<length;i++)
	{
		switch (dna[i])
		{
		case 'c': case 'C':
			buffer[i]=2;
			break;
		case 'g': case 'G':
			buffer[i]=3;
			break;
		case 't': case 'T':
			buffer[i]=4;
			break;
		case 'a': case 'A':
		default:
			/* unknown symbols are deliberately folded onto the code of 'A' */
			buffer[i]=1;
			break;
		}
	}

	/*
	 * When length<=0 the loop never runs and i stays 0; without the i>0 guard
	 * the terminator would be written to buffer[-1], outside the array.
	 */
	if(withTerminationChar && i>0)
		buffer[i-1]=0;

	return 0;
}

/*
 * Packs numeric DNA codes into 2 bits per symbol, 16 symbols per unsigned int.
 *
 * buffer              - output words; word k holds symbols 16*k .. 16*k+15.
 *                       Each word is cleared before any bit is set in it.
 * dnanumeric          - input codes; 2, 3 and 4 set bits, any other value
 *                       leaves both bits of its pair clear.
 * length              - number of codes to pack.
 * lengthInLongs       - receives the number of words used; this is 1 even
 *                       when length is 0, because buffer[0] is always cleared.
 * withterminationchar - accepted but not used by this function.
 *
 * For the symbol at in-word offset m (0, 2, 4, ...), the pair is written with
 * setBit at offsets m and m+1: code 2 -> "01", code 3 -> "10", code 4 -> "11".
 * How an offset maps onto a physical bit is decided by setBit.
 *
 * Always returns 0.
 */
int getLongBinaryEncoding(unsigned int *buffer, int *dnanumeric,int length, int *lengthInLongs,
									 int withterminationchar)
{
	const int symbolsPerWord=16;
	unsigned int *word=buffer;
	int wordIndex=0;
	int bitOffset=0;
	int pos;

	*word=0L;

	for(pos=0;pos<length;pos++)
	{
		const int code=dnanumeric[pos];

		/* every 16th symbol (except the very first) starts a fresh, zeroed word */
		if(pos>0 && pos%symbolsPerWord==0)
		{
			wordIndex++;
			word=&buffer[wordIndex];
			*word=0L;
			bitOffset=0;
		}

		switch(code)
		{
		case 2: /* 01 */
			setBit(word,bitOffset+1);
			break;
		case 3: /* 10 */
			setBit(word,bitOffset);
			break;
		case 4: /* 11 */
			setBit(word,bitOffset);
			setBit(word,bitOffset+1);
			break;
		default: /* 00: nothing to set */
			break;
		}

		bitOffset+=2;
	}

	*lengthInLongs=wordIndex+1;
	return 0;
}

/*
 * printf-style formatting into a fixed-size path buffer.
 * Returns 1 when the whole text (terminator included) fitted into `capacity`
 * bytes, 0 when formatting failed or the result had to be truncated.
 */
static int encodeFolderFormatPath(char *dest, size_t capacity, const char *format, ...)
{
	va_list args;
	int written;

	va_start(args, format);
	written=vsnprintf(dest, capacity, format, args);
	va_end(args);

	return written>=0 && (size_t)written<capacity;
}

/*
 * Writes `count` ints from `values` into the binary file `fileName`.
 * Prints `openError` or `writeError` and returns 1 on failure; returns 0 on
 * success. The file is closed on every path.
 */
static int encodeFolderWriteInts(const char *fileName, const int *values, int count,
								 const char *openError, const char *writeError)
{
	FILE *out;
	int written;

	out=fopen(fileName, "wb");
	if(out==NULL)
	{
		printf("%s", openError);
		return 1;
	}

	written=(int)fwrite(values, sizeof (int), count, out);
	fclose(out);

	if(written!=count)
	{
		printf("%s", writeError);
		return 1;
	}

	return 0;
}

/*
 * Encodes every input DNA file of a folder into a numeric and a packed binary
 * file, then writes three summary files.
 *
 * inputFolder / inputPrefix   - input file i is "<inputFolder><inputPrefix>_<i>";
 *                               an optional descriptor file
 *                               "<inputFolder><inputPrefix><i>_content" is
 *                               consulted when it can be opened.
 * numOfFiles                  - number of input files, indexed 0..numOfFiles-1.
 * outputFolder / outputPrefix - per-file outputs are
 *                               "<outputFolder><outputPrefix>_smallnumeric_<i>" and
 *                               "<outputFolder><outputPrefix>_smallbinary_<i>";
 *                               summaries are "..._lengths", "..._binarylengths"
 *                               and "..._filenumbers".
 * maxChunkSize                - capacity (in characters) of the working
 *                               buffers; an input file larger than this is
 *                               rejected instead of overflowing them.
 *
 * Per file, bitSeqLengths[i] is 2*size. chunkLengths[i] is the file size,
 * except when a content file with at most one FileData record exists, in which
 * case it is that record's lengthToIndex.
 *
 * Returns 0 on success, 2 when a per-file output cannot be created, and 1 on
 * any other failure. All files and buffers are released on every path.
 */
int preprocessFolder(char *inputFolder, char *inputPrefix, int numOfFiles,
					 char *outputFolder, char *outputPrefix, int maxChunkSize)
{
	FILE *inputFile=NULL;
	FILE *contentFile=NULL;
	FILE *outputFile=NULL;

	char *inputbuffer=NULL;
	int *numericOutput=NULL;
	unsigned int *binaryOutput=NULL;
	int *chunkLengths=NULL;
	int *bitSeqLengths=NULL;
	int *fileNumbers=NULL;

	char currinputfilename [MAX_PATH_LENGTH];
	char currinputcontentfilename [MAX_PATH_LENGTH];
	char currnumericfilename [MAX_PATH_LENGTH];
	char currbinaryfilename [MAX_PATH_LENGTH];
	char smallNumericFileNamePrefix[MAX_PATH_LENGTH];
	char smallBinaryFileNamePrefix[MAX_PATH_LENGTH];
	char lengthsFileName[MAX_PATH_LENGTH];
	char bitSequencesLengthsFileName[MAX_PATH_LENGTH];
	char fileNumbersFileName[MAX_PATH_LENGTH];

	FileData descriptor[1];
	long fileSize;
	int size;
	int result;
	int lengthInLongs=0;
	int chunksCount=0;
	int i;
	int status=1;	/* pessimistic default; set to 0 only after everything succeeded */

	if(!encodeFolderFormatPath(smallNumericFileNamePrefix,sizeof smallNumericFileNamePrefix,
			"%s%s_smallnumeric",outputFolder,outputPrefix)
		|| !encodeFolderFormatPath(smallBinaryFileNamePrefix,sizeof smallBinaryFileNamePrefix,
			"%s%s_smallbinary",outputFolder,outputPrefix))
	{
		printf("Error: output file path is too long.\n");
		goto cleanup;
	}

	inputbuffer=(char*) calloc (maxChunkSize, sizeof(char));
	if (inputbuffer == NULL)
	{
		printf ("1. Memory error\n");
		goto cleanup;
	}

	numericOutput=(int*) calloc (maxChunkSize, sizeof(int));
	if (numericOutput == NULL)
	{
		printf ("2. Memory error\n");
		goto cleanup;
	}

	/* 16 symbols fit into one word, so maxChunkSize/4+1 words is more than enough */
	binaryOutput=(unsigned int*) calloc (maxChunkSize/4+1, sizeof(unsigned int));
	if (binaryOutput == NULL)
	{
		printf ("3. Memory error\n");
		goto cleanup;
	}

	chunkLengths=(int*) calloc (numOfFiles, sizeof(int));
	if (chunkLengths == NULL)
	{
		printf ("4. Memory error\n");
		goto cleanup;
	}

	bitSeqLengths=(int*) calloc (numOfFiles, sizeof(int));
	if (bitSeqLengths == NULL)
	{
		printf ("5. Memory error\n");
		goto cleanup;
	}

	fileNumbers=(int*) calloc (numOfFiles, sizeof(int));
	if (fileNumbers == NULL)
	{
		printf ("6. Memory error\n");
		goto cleanup;
	}

	for(i=0;i<numOfFiles;i++)
	{
		/* NOTE(review): the content file name has no '_' between prefix and
		   index, unlike the input file name - confirm this matches the producer. */
		if(!encodeFolderFormatPath(currinputfilename,sizeof currinputfilename,
				"%s%s_%i",inputFolder,inputPrefix,i)
			|| !encodeFolderFormatPath(currinputcontentfilename,sizeof currinputcontentfilename,
				"%s%s%i_content",inputFolder,inputPrefix,i)
			|| !encodeFolderFormatPath(currnumericfilename,sizeof currnumericfilename,
				"%s_%i", smallNumericFileNamePrefix,i)
			|| !encodeFolderFormatPath(currbinaryfilename,sizeof currbinaryfilename,
				"%s_%i", smallBinaryFileNamePrefix,i))
		{
			printf("Error: file path for file %i is too long.\n",i);
			goto cleanup;
		}

		inputFile=fopen ( currinputfilename , "rb" );
		if(inputFile==NULL)
		{
			printf("Could not open input DNA file \"%s\" \n", currinputfilename);
			goto cleanup;
		}

		fseek (inputFile, 0, SEEK_END);
		fileSize=ftell (inputFile);
		/* the working buffers hold maxChunkSize symbols; a bigger file (or a
		   failed ftell, which yields -1) must not reach fread below */
		if(fileSize<0 || fileSize>maxChunkSize)
		{
			printf("Error: input DNA file \"%s\" has an invalid size or exceeds the maximum chunk size %d\n",
				currinputfilename, maxChunkSize);
			goto cleanup;
		}
		size=(int)fileSize;
		printf("reading file %i of length %d\n",i,size);
		rewind(inputFile);

		result=fread(inputbuffer,sizeof(char),size,inputFile);
		if(result!=size)
		{
			printf("Error reading input DNA file \"%s\" \n", currinputfilename);
			goto cleanup;
		}

		/* the content file is optional: failing to open it is not an error */
		contentFile=fopen ( currinputcontentfilename , "rb" );
		if(contentFile==NULL)
		{
			chunkLengths[i]=size;
			bitSeqLengths[i]=2*size;
		}
		else
		{
			fseek (contentFile, 0, SEEK_END);
			chunksCount=ftell (contentFile)/sizeof(FileData);
			printf("reading content file %i consisting of %i chunks\n",i,chunksCount);
			rewind(contentFile);

			/* NOTE(review): fileNumbers[i] is only assigned when a content file
			   exists; otherwise it keeps the 0 from calloc - confirm intended. */
			fileNumbers[i]=i;
			bitSeqLengths[i]=2*size;
			if(chunksCount>1)
			{
				chunkLengths[i]=size;
			}
			else
			{
				result=fread(descriptor,sizeof(FileData),1,contentFile);
				if(result!=1)
				{
					printf("Error reading input content file \"%s\" \n", currinputcontentfilename);
					goto cleanup;
				}

				chunkLengths[i]=descriptor[0].lengthToIndex;
			}

			fclose(contentFile);
			contentFile=NULL;
		}

		printf("Encoding file %i of total size %d, size to index=%d\n",i,bitSeqLengths[i]/2,chunkLengths[i]);

		/* numeric (one int per symbol) output */
		outputFile = fopen(currnumericfilename, "wb");
		if(outputFile==NULL)
		{
			printf("Error: can't create temporary file %s for writing encoded DNA.\n",currnumericfilename);
			status=2;
			goto cleanup;
		}

		if(get1234Encoding(numericOutput,inputbuffer,size,0))
			goto cleanup;

		result=fwrite(numericOutput, sizeof(int), size, outputFile);
		if(result!=size)
		{
			printf("Write error: not all numeric encoding was written.\n");
			goto cleanup;
		}

		fclose(outputFile);
		outputFile=NULL;

		/* packed binary (2 bits per symbol) output */
		outputFile = fopen(currbinaryfilename, "wb");
		if(outputFile==NULL)
		{
			printf("Error: can't create temporary file for writing binary encoded DNA.\n");
			status=2;
			goto cleanup;
		}

		if(getLongBinaryEncoding(binaryOutput,numericOutput,size,
			&lengthInLongs,0))
			goto cleanup;

		result=fwrite(binaryOutput, sizeof(unsigned int), lengthInLongs, outputFile);
		if(result!=(lengthInLongs))
		{
			printf("Write error: not all binary encoded DNA was written.\n");
			goto cleanup;
		}

		fclose(outputFile);
		outputFile=NULL;
		fclose(inputFile);
		inputFile=NULL;
	}

	/* now write information about lengths and input files into files */
	if(!encodeFolderFormatPath(lengthsFileName,sizeof lengthsFileName,
			"%s%s_lengths", outputFolder, outputPrefix)
		|| !encodeFolderFormatPath(bitSequencesLengthsFileName,sizeof bitSequencesLengthsFileName,
			"%s%s_binarylengths", outputFolder, outputPrefix)
		|| !encodeFolderFormatPath(fileNumbersFileName,sizeof fileNumbersFileName,
			"%s%s_filenumbers", outputFolder, outputPrefix))
	{
		printf("Error: output file path is too long.\n");
		goto cleanup;
	}

	if(encodeFolderWriteInts(lengthsFileName, chunkLengths, numOfFiles,
			"Error: can't create file for chunkLengths writing.\n",
			"Error: not all chunkLengths were written 1.\n"))
		goto cleanup;

	if(encodeFolderWriteInts(bitSequencesLengthsFileName, bitSeqLengths, numOfFiles,
			"Error: can't create file for bit Sequences Lengths writing.\n",
			"Error: not all binary chunk Lengths were written 1.\n"))
		goto cleanup;

	if(encodeFolderWriteInts(fileNumbersFileName, fileNumbers, numOfFiles,
			"Error: can't create file for file numbers writing.\n",
			"Error: not all file numbers are written 1.\n"))
		goto cleanup;

	status=0;

cleanup:
	/* single exit: whatever is still open or allocated is released here */
	if(outputFile!=NULL)
		fclose(outputFile);
	if(contentFile!=NULL)
		fclose(contentFile);
	if(inputFile!=NULL)
		fclose(inputFile);

	free(fileNumbers);
	free(bitSeqLengths);
	free(chunkLengths);
	free(binaryOutput);
	free(numericOutput);
	free(inputbuffer);

	return status;
}

/*
 * Entry point: encodeFolder <inputfilefolder> <inputfileprefix> <outputfolder> <outputprefix>
 *
 * Reads three ints from "<inputfilefolder><inputfileprefix>_input_info":
 * info[0] is used as the number of input files and info[2] as the largest
 * file size (info[1] is read but not used here). The values are validated and
 * then handed to preprocessFolder with a buffer size of info[2]+1.
 *
 * Returns 0 on success and 1 on any failure.
 */
int main(int argc, char *argv[])
{

	char *inputfolder;
	char *inputprefix;
	char *outputfolder;
	char *outputprefix;
	int numoffiles;
	int maxfilesize;
	FILE *infofile;
	char infofilename[MAX_PATH_LENGTH];
	int info[3];
	int written;
	size_t itemsRead;

	if(argc<5)
	{
		printf("To run: encodeFolder <inputfilefolder> <inputfileprefix>  <outputfolder> <outputprefix>\n");
		return 1;
	}

	inputfolder=argv[1];
	inputprefix=argv[2];
	outputfolder=argv[3];
	outputprefix=argv[4];

	/* bounded formatting: command-line arguments can be arbitrarily long */
	written=snprintf(infofilename,sizeof infofilename,"%s%s_input_info", inputfolder,inputprefix);
	if(written<0 || (size_t)written>=sizeof infofilename)
	{
		printf("Input info file path is too long \n");
		return 1;
	}

	//1. read info to compute min subscript, max subscript and max file size
	infofile=fopen ( infofilename , "rb" );
	if(infofile==NULL)
	{
		printf("Could not open input info file %s for reading \n",infofilename);
		return 1;
	}

	itemsRead=fread(info, sizeof(int), 3, infofile);
	fclose(infofile);	/* closed before the check so the error path does not leak it */
	if(itemsRead!=3)
	{
		printf("Error reading input info \n");
		return 1;
	}

	/* the values come from a file: reject counts/sizes that are negative or
	   that would overflow the "+1" below */
	if(info[0]<0 || info[2]<0 || info[2]==INT_MAX)
	{
		printf("Invalid values in input info file %s \n",infofilename);
		return 1;
	}

	numoffiles=info[0];
	maxfilesize=info[2]+1;

	if(preprocessFolder(inputfolder, inputprefix, numoffiles,
					 outputfolder, outputprefix, maxfilesize))
		return 1;

	return 0;
}
